Libraries
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(skimr)
library(png)
library(grid)
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
Load data
load_csv <- function(csv_name, data_folder = 'data') {
df <- read.csv(file.path(data_folder, csv_name))
as_tibble(df)
}
inventories_df <- load_csv('inventories.csv')
inventory_sets_df <- load_csv('inventory_sets.csv')
sets_df <- load_csv('sets.csv')
themes_df <- load_csv('themes.csv')
inventory_minifigs_df <- load_csv('inventory_minifigs.csv')
minifigs_df <- load_csv('minifigs.csv')
inventory_parts_df <- load_csv('inventory_parts.csv')
colors_df <- load_csv('colors.csv')
parts_df <- load_csv('parts.csv')
elements_df <- load_csv('elements.csv')
part_categories_df <- load_csv('part_categories.csv')
part_relationships_df <- load_csv('part_relationships.csv')
Basic tables analysis
Data schema

Tables
Inventories
Data summary
| 1 |
1 |
7922-1 |
| 3 |
1 |
3931-1 |
| 4 |
1 |
6942-1 |
| 15 |
1 |
5158-1 |
| 16 |
1 |
903-1 |
| 17 |
1 |
850950-1 |
| Name |
inventories_df |
| Number of rows |
37265 |
| Number of columns |
3 |
| _______________________ |
|
| Column type frequency: |
|
| character |
1 |
| numeric |
2 |
| ________________________ |
|
| Group variables |
None |
Variable type: character
| set_num |
0 |
1 |
3 |
20 |
0 |
35644 |
0 |
Variable type: numeric
| id |
0 |
1 |
61103.60 |
51380.10 |
1 |
14424 |
54379 |
88842 |
194312 |
▇▆▂▂▂ |
| version |
0 |
1 |
1.09 |
0.58 |
1 |
1 |
1 |
1 |
16 |
▇▁▁▁▁ |
Inventory sets
Data summary
| 35 |
75911-1 |
1 |
| 35 |
75912-1 |
1 |
| 39 |
75048-1 |
1 |
| 39 |
75053-1 |
1 |
| 50 |
4515-1 |
1 |
| 50 |
4520-1 |
2 |
| Name |
inventory_sets_df |
| Number of rows |
4358 |
| Number of columns |
3 |
| _______________________ |
|
| Column type frequency: |
|
| character |
1 |
| numeric |
2 |
| ________________________ |
|
| Group variables |
None |
Variable type: character
| set_num |
0 |
1 |
5 |
20 |
0 |
3171 |
0 |
Variable type: numeric
| inventory_id |
0 |
1 |
52518.95 |
59063.13 |
35 |
8076 |
16423 |
98685 |
191576 |
▇▁▁▂▁ |
| quantity |
0 |
1 |
1.81 |
5.67 |
1 |
1 |
1 |
1 |
60 |
▇▁▁▁▁ |
Sets
| Name |
sets_df |
| Number of rows |
21880 |
| Number of columns |
6 |
| _______________________ |
|
| Column type frequency: |
|
| character |
3 |
| numeric |
3 |
| ________________________ |
|
| Group variables |
None |
Variable type: character
| set_num |
0 |
1 |
3 |
20 |
0 |
21880 |
0 |
| name |
0 |
1 |
2 |
93 |
0 |
18752 |
0 |
| img_url |
0 |
1 |
46 |
63 |
0 |
21880 |
0 |
Variable type: numeric
| year |
0 |
1 |
2007.76 |
13.96 |
1949 |
2001 |
2012 |
2018 |
2024 |
▁▁▁▃▇ |
| theme_id |
0 |
1 |
441.97 |
215.53 |
1 |
273 |
497 |
608 |
752 |
▃▃▃▇▇ |
| num_parts |
0 |
1 |
161.38 |
418.14 |
0 |
3 |
31 |
139 |
11695 |
▇▁▁▁▁ |
Themes
Data summary
| 1 |
Technic |
NA |
| 3 |
Competition |
1 |
| 4 |
Expert Builder |
1 |
| 16 |
RoboRiders |
1 |
| 17 |
Speed Slammers |
1 |
| 18 |
Star Wars |
1 |
| Name |
themes_df |
| Number of rows |
468 |
| Number of columns |
3 |
| _______________________ |
|
| Column type frequency: |
|
| character |
1 |
| numeric |
2 |
| ________________________ |
|
| Group variables |
None |
Variable type: character
Variable type: numeric
| id |
0 |
1.00 |
433.46 |
216.55 |
1 |
250.5 |
466 |
625.25 |
752 |
▅▅▅▆▇ |
| parent_id |
145 |
0.69 |
360.64 |
197.19 |
1 |
186.0 |
411 |
512.50 |
697 |
▅▃▂▇▂ |
Inventory Minifigs
Data summary
| 3 |
fig-001549 |
1 |
| 4 |
fig-000764 |
1 |
| 19 |
fig-000555 |
1 |
| 25 |
fig-000574 |
1 |
| 26 |
fig-000842 |
1 |
| 26 |
fig-008641 |
1 |
| Name |
inventory_minifigs_df |
| Number of rows |
20858 |
| Number of columns |
3 |
| _______________________ |
|
| Column type frequency: |
|
| character |
1 |
| numeric |
2 |
| ________________________ |
|
| Group variables |
None |
Variable type: character
| fig_num |
0 |
1 |
10 |
10 |
0 |
13455 |
0 |
Variable type: numeric
| inventory_id |
0 |
1 |
43010.44 |
52256.78 |
3 |
7869 |
15681 |
66834 |
194312 |
▇▁▁▁▁ |
| quantity |
0 |
1 |
1.06 |
0.78 |
1 |
1 |
1 |
1 |
100 |
▇▁▁▁▁ |
Minifigs
| Name |
minifigs_df |
| Number of rows |
13764 |
| Number of columns |
4 |
| _______________________ |
|
| Column type frequency: |
|
| character |
3 |
| numeric |
1 |
| ________________________ |
|
| Group variables |
None |
Variable type: character
| fig_num |
0 |
1 |
10 |
10 |
0 |
13764 |
0 |
| name |
0 |
1 |
1 |
148 |
0 |
13354 |
0 |
| img_url |
0 |
1 |
53 |
53 |
0 |
13764 |
0 |
Variable type: numeric
| num_parts |
0 |
1 |
5.3 |
6.03 |
0 |
4 |
4 |
5 |
156 |
▇▁▁▁▁ |
Inventory parts
| Name |
inventory_parts_df |
| Number of rows |
1180987 |
| Number of columns |
6 |
| _______________________ |
|
| Column type frequency: |
|
| character |
3 |
| numeric |
3 |
| ________________________ |
|
| Group variables |
None |
Variable type: character
| part_num |
0 |
1 |
1 |
20 |
0 |
51051 |
0 |
| is_spare |
0 |
1 |
1 |
1 |
0 |
2 |
0 |
| img_url |
0 |
1 |
0 |
117 |
8180 |
74266 |
0 |
Variable type: numeric
| inventory_id |
0 |
1 |
50849.46 |
55136.94 |
1 |
9404 |
22838 |
87088 |
194312 |
▇▂▁▂▁ |
| color_id |
0 |
1 |
131.78 |
862.38 |
-1 |
4 |
15 |
71 |
9999 |
▇▁▁▁▁ |
| quantity |
0 |
1 |
3.37 |
9.95 |
1 |
1 |
2 |
4 |
3064 |
▇▁▁▁▁ |
Colors
Data summary
| -1 |
[Unknown] |
0033B2 |
f |
| 0 |
Black |
05131D |
f |
| 1 |
Blue |
0055BF |
f |
| 2 |
Green |
237841 |
f |
| 3 |
Dark Turquoise |
008F9B |
f |
| 4 |
Red |
C91A09 |
f |
| Name |
colors_df |
| Number of rows |
263 |
| Number of columns |
4 |
| _______________________ |
|
| Column type frequency: |
|
| character |
3 |
| numeric |
1 |
| ________________________ |
|
| Group variables |
None |
Variable type: character
| name |
0 |
1 |
3 |
28 |
0 |
263 |
0 |
| rgb |
0 |
1 |
6 |
6 |
0 |
223 |
0 |
| is_trans |
0 |
1 |
1 |
1 |
0 |
2 |
0 |
Variable type: numeric
| id |
0 |
1 |
651.38 |
750.55 |
-1 |
83 |
1005 |
1070.5 |
9999 |
▇▁▁▁▁ |
Parts
Data summary
| 003381 |
Sticker Sheet for Set 663-1 |
58 |
Plastic |
| 003383 |
Sticker Sheet for Sets 618-1, 628-2 |
58 |
Plastic |
| 003402 |
Sticker Sheet for Sets 310-3, 311-1, 312-3 |
58 |
Plastic |
| 003429 |
Sticker Sheet for Set 1550-1 |
58 |
Plastic |
| 003432 |
Sticker Sheet for Sets 357-1, 355-1, 940-1 |
58 |
Plastic |
| 003434 |
Sticker Sheet for Set 575-2, 653-1, 460-1 |
58 |
Plastic |
| Name |
parts_df |
| Number of rows |
52615 |
| Number of columns |
4 |
| _______________________ |
|
| Column type frequency: |
|
| character |
3 |
| numeric |
1 |
| ________________________ |
|
| Group variables |
None |
Variable type: character
| part_num |
0 |
1 |
1 |
20 |
0 |
52615 |
0 |
| name |
0 |
1 |
3 |
222 |
0 |
52103 |
0 |
| part_material |
0 |
1 |
4 |
16 |
0 |
7 |
0 |
Variable type: numeric
| part_cat_id |
0 |
1 |
38.91 |
22.08 |
1 |
17 |
41 |
60 |
68 |
▃▃▂▁▇ |
Elements
Data summary
| 6443403 |
2277c01pr0009 |
1 |
2277 |
| 6300211 |
67906c01 |
14 |
67908 |
| 4566309 |
2564 |
0 |
2564 |
| 4275423 |
53657 |
1004 |
53657 |
| 6194308 |
92926 |
71 |
28967 |
| 6229123 |
26561 |
4 |
26561 |
| Name |
elements_df |
| Number of rows |
84138 |
| Number of columns |
4 |
| _______________________ |
|
| Column type frequency: |
|
| character |
1 |
| numeric |
3 |
| ________________________ |
|
| Group variables |
None |
Variable type: character
| part_num |
0 |
1 |
2 |
19 |
0 |
33765 |
0 |
Variable type: numeric
| element_id |
0 |
1.00 |
5222065.12 |
1596842.63 |
9327 |
4259773.50 |
6057754 |
6262024.5 |
61532443 |
▇▁▁▁▁ |
| color_id |
0 |
1.00 |
539.67 |
2044.86 |
-1 |
8.00 |
28 |
135.0 |
9999 |
▇▁▁▁▁ |
| design_id |
23682 |
0.72 |
45569.87 |
30750.66 |
1001 |
18453.75 |
41748 |
75474.5 |
107520 |
▇▆▅▅▃ |
Part Relationships
Data summary
| P |
3626cpr3662 |
3626c |
| P |
87079pr9974 |
87079 |
| P |
3960pr9971 |
3960 |
| R |
98653pr0003 |
98086pr0003 |
| R |
98653pr0003 |
98088pat0003 |
| R |
98653pr0003 |
98089pat0003 |
| Name |
part_relationships_df |
| Number of rows |
29977 |
| Number of columns |
3 |
| _______________________ |
|
| Column type frequency: |
|
| character |
3 |
| ________________________ |
|
| Group variables |
None |
Variable type: character
| rel_type |
0 |
1 |
1 |
1 |
0 |
6 |
0 |
| child_part_num |
0 |
1 |
1 |
20 |
0 |
27139 |
0 |
| parent_part_num |
0 |
1 |
1 |
19 |
0 |
4725 |
0 |
Part categories
Data summary
| 1 |
Baseplates |
| 3 |
Bricks Sloped |
| 4 |
Duplo, Quatro and Primo |
| 5 |
Bricks Special |
| 6 |
Bricks Wedged |
| 7 |
Containers |
| Name |
part_categories_df |
| Number of rows |
66 |
| Number of columns |
2 |
| _______________________ |
|
| Column type frequency: |
|
| character |
1 |
| numeric |
1 |
| ________________________ |
|
| Group variables |
None |
Variable type: character
Variable type: numeric
| id |
0 |
1 |
35.36 |
19.41 |
1 |
19.25 |
35.5 |
51.75 |
68 |
▇▇▇▇▇ |
Dataframes size
All rows in dataframes: 1446639
All columns in dataframes: 45
All values in dataframes: 8099232
Analysis deep dive
Themes
Themes sum of rows with NA value
## [1] 145
All NA values are in parent_id column
## # A tibble: 1 × 3
## id name parent_id
## <int> <int> <int>
## 1 0 0 145
Create themes_parent_df
## # A tibble: 145 × 2
## id name
## <int> <chr>
## 1 1 Technic
## 2 22 Creator
## 3 34 Make & Create
## 4 50 Town
## 5 52 City
## 6 112 Racers
## 7 126 Space
## 8 147 Pirates
## 9 155 Modular Buildings
## 10 158 Star Wars
## # ℹ 135 more rows
Join themes_parent_df with themes_df, insert name_child into
name_parent if name_parent is NA
## # A tibble: 468 × 3
## id name_child name_parent
## <int> <chr> <chr>
## 1 1 Technic Technic
## 2 3 Competition Technic
## 3 4 Expert Builder Technic
## 4 16 RoboRiders Technic
## 5 17 Speed Slammers Technic
## 6 18 Star Wars Technic
## 7 19 Supplemental Technic
## 8 20 Throwbot Slizer Technic
## 9 21 Universal Building Set Technic
## 10 22 Creator Creator
## # ℹ 458 more rows
Sets
Sets sum of rows with NA value
## [1] 0
Correlation between year and num_parts

## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

Join sets with prepared themes
## # A tibble: 21,880 × 4
## year num_parts name_child name_parent
## <int> <int> <chr> <chr>
## 1 1965 43 Technic Technic
## 2 1979 12 Classic Town Town
## 3 1987 0 Lion Knights Castle
## 4 1979 12 Supplemental Space
## 5 1979 12 Supplemental Space
## 6 1979 2 Supplemental Space
## 7 1979 18 Supplemental Space
## 8 1979 15 Classic Castle Castle
## 9 1965 3 Technic Technic
## 10 2013 4 Books Books
## # ℹ 21,870 more rows
156 Parent themes with number of sets and sum of parts
## # A tibble: 156 × 3
## name_parent count_sets sum_parts
## <chr> <int> <int>
## 1 4 Juniors 54 2317
## 2 Adventurers 83 8458
## 3 Agents 28 11970
## 4 Alpha Team 31 4313
## 5 Angry Birds 6 2388
## 6 Animal Crossing 5 0
## 7 Aquazone 37 6577
## 8 Architecture 60 39346
## 9 Atlantis 24 5323
## 10 Avatar 9 5522
## # ℹ 146 more rows
Top 10 themes based on number of sets
## # A tibble: 10 × 3
## name_parent count_sets sum_parts
## <chr> <int> <int>
## 1 Gear 3265 8653
## 2 Duplo 1293 45232
## 3 Books 981 8007
## 4 Star Wars 948 348777
## 5 City 870 183107
## 6 Collectible Minifigures 858 6025
## 7 Technic 804 272966
## 8 Educational and Dacta 678 127674
## 9 Town 677 84587
## 10 Friends 561 121489

## Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors
## Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors
More analysis (interactive + animations)
Trends
Forecasting